#!/usr/bin/env python
#-*-coding:utf-8-*-


# Input corpus read by split_to_train(): non-empty lines are grouped into
# sentences and empty lines act as sentence separators. The path is relative,
# so it is resolved against the current working directory.
path = "../../data/crf_ner_all.txt"

def split_to_train(src_path=None, out_dir="../../data/ner"):
    """Split a blank-line-delimited corpus into train/test/dev files.

    The input is read line by line: consecutive non-empty lines form one
    sentence, and empty lines separate sentences. A final sentence that is
    not followed by an empty line is kept as well. Sentences keep their
    original order (no shuffling): the first 80% are written to
    ``crf_ner_train.txt``, the next 10% to ``crf_ner_test.txt`` and the
    remainder to ``crf_ner_dev.txt``. Every sentence is written one line per
    item, followed by an empty line. The total sentence count is printed.

    Args:
        src_path: Corpus file to split. When ``None`` (the default) the
            module-level ``path`` is used.
        out_dir: Directory receiving the three output files. It must already
            exist; it is not created here.

    Returns:
        None.

    Raises:
        IOError/OSError: If the input cannot be read or an output file
            cannot be opened for writing.
    """
    import os  # local import so the function does not rely on file-level imports

    if src_path is None:
        # Resolved at call time so the module-level default stays in effect.
        src_path = path

    # NOTE(review): files are opened with the platform default encoding, as
    # in the original code -- confirm this matches the corpus encoding.
    sentences = []
    sent = []
    with open(src_path) as src:
        for line in src:
            line = line.strip("\n")
            if line:
                sent.append(line)
            elif sent:
                sentences.append(sent)
                sent = []
    # Flush the last sentence when the file does not end with a blank line;
    # previously it was silently dropped.
    if sent:
        sentences.append(sent)

    sent_count = len(sentences)
    # Index thresholds: [0, train_end) -> train, [train_end, test_end) -> test,
    # the rest -> dev.
    train_end = sent_count * 0.8
    test_end = sent_count * 0.9

    print(sent_count)

    # The context manager guarantees all three outputs are flushed and closed,
    # even if a write fails part-way through.
    with open(os.path.join(out_dir, "crf_ner_train.txt"), 'w') as train_f, \
            open(os.path.join(out_dir, "crf_ner_dev.txt"), 'w') as dev_f, \
            open(os.path.join(out_dir, "crf_ner_test.txt"), 'w') as test_f:
        for i, sentence in enumerate(sentences):
            if i < train_end:
                outf = train_f
            elif i < test_end:
                outf = test_f
            else:
                outf = dev_f
            # One item per line, then an empty line terminating the sentence.
            outf.write("\n".join(sentence) + "\n\n")



# Runs the split as soon as this file is executed -- or imported, since there
# is no `if __name__ == "__main__":` guard around the call.
split_to_train()